Setup

knitr::opts_chunk$set(warning = FALSE)

source(here::here('code', 'helpers.R'))
library(tidyverse)
library(forcats)
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(agricolae)
library(ggupset)
library(RColorBrewer)
library(patchwork)
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
## 
##     align_plots
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## Registered S3 methods overwritten by 'vegan':
##   method      from
##   plot.rda    klaR
##   predict.rda klaR
##   print.rda   klaR
## This is vegan 2.6-4
library(pheatmap)
library(ggradar)
library(broom)
data <- targets::tar_read(merged_all_results)
truth <- targets::tar_read(truth_set_data)
table(data$Type)
## 
##    BLAST100     BLAST97   CustomNBC   DADA2Spec    DADA2Tax  Kraken_0.0 
##       13912       21093      155700      157104      157104      104880 
## Kraken_0.05  Kraken_0.1    Metabuli MMSeqs2_100  MMSeqs2_97      Mothur 
##      104880      104880       73217      109368      109368      152892 
##      Qiime2     VSEARCH 
##       23275      157104

Let’s remove the >0.2 Kraken runs, those are too strict

data <- data |> filter(!Type %in% c('Kraken_0.2', 'Kraken_0.3', 'Kraken_0.4', 'Kraken_0.5', 'Kraken_0.6', 'Kraken_0.7', 'Kraken_0.8', 'Kraken_0.9'))

Check whether any of the data is missing

type_list <- data |># filter(!str_detect(Subject, 'c01')) |> 
  group_by(Type) |>
  summarize('Subject' = list(unique(Subject)))
all_subjects <- unique(data |># filter(!str_detect(Subject, 'c01')) |> 
                         pull(Subject))

for(i in type_list$Type){
  this_subjects <- type_list |> filter(Type == i) |> pull(Subject)
  missing <- setdiff(all_subjects, this_subjects[[1]])
  if (length(missing) > 0){
    print(i)
    print(missing)
  }
}
## [1] "BLAST100"
## [1] "12S_v10_HmmCut.fasta" "16S_v04_HmmCut.fasta" "c01_v03_HmmCut.fasta"
## [1] "BLAST97"
## [1] "12S_v10_HmmCut.fasta" "16S_v04_HmmCut.fasta" "c01_v03_HmmCut.fasta"
## [1] "CustomNBC"
## [1] "c01_v03_HmmCut.fasta"
## [1] "MMSeqs2_100"
## [1] "12S_v10_HmmCut.fasta" "16S_v04_HmmCut.fasta" "c01_v03_HmmCut.fasta"
## [1] "MMSeqs2_97"
## [1] "12S_v10_HmmCut.fasta" "16S_v04_HmmCut.fasta" "c01_v03_HmmCut.fasta"
## [1] "Metabuli"
## [1] "c01_v03_HmmCut.fasta"
## [1] "Mothur"
## [1] "12S_v10_HmmCut.fasta" "16S_v04_HmmCut.fasta" "c01_v03_HmmCut.fasta"
## [1] "Qiime2"
## [1] "c01_v03_HmmCut.fasta"

OK, we can ignore the HmmCut ones.

data |> write_tsv('./results/cleaned_and_filtered_data.tsv.gz')
names(table(data$Query))
##  [1] "KWest_16S_PooledTrue.fa"                                                                               
##  [2] "make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa"  
##  [3] "make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa"
##  [4] "make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa"  
##  [5] "make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa"
##  [6] "make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa"       
##  [7] "make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa"                              
##  [8] "make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa"                                  
##  [9] "make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa"                    
## [10] "make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa"                    
## [11] "make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa"                    
## [12] "make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa"                          
## [13] "make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa"                          
## [14] "make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa"

100 species

Check 12S results

table(data$Subject)
## 
##                                                    12s_v010_final.fasta 
##                                                                   16830 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                   16415 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                   16154 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                   16158 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                   16252 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                   16329 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                   16443 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                   16095 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                   16206 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                   16501 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                   16287 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                   16606 
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                   16516 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                   16410 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                   16576 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                   16315 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                   16537 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                   16225 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                   16388 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                   16345 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                   16522 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                   15956 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                   15695 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                   16021 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                   16039 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                   15821 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                   15944 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                   15793 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                   15829 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                   15823 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                   15866 
##                                                    12S_v10_HmmCut.fasta 
##                                                                    8714 
##                                                     16S_v04_final.fasta 
##                                                                   17914 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                   16614 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                   16600 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                   16975 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                   16807 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                   17016 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                   16929 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                   16570 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                   17081 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                   16405 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                   16525 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                   17290 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                   17107 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                   17405 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                   17070 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                   17285 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                   17609 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                   17193 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                   17344 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                   17502 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                   17399 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                   16412 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                   16567 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                   16534 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                   16431 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                   16675 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                   16414 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                   16223 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                   16385 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                   16250 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                   16358 
##                                                    16S_v04_HmmCut.fasta 
##                                                                    9590 
##                                                     c01_v03_final.fasta 
##                                                                   12849 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                   12526 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                   13147 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                   12573 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                   13159 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                   12347 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                   12512 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                   12463 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                   12913 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                   12469 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                   13165 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                   12695 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                   12623 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                   12596 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                   12574 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                   12660 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                   12686 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                   12569 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                   12659 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                   12812 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                   12739 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                   13229 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                   13188 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                   13049 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                   12293 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                   12250 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                   12753 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                   13043 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                   12881 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                   12231 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                   12242 
##                                                    c01_v03_HmmCut.fasta 
##                                                                    6792
table(data$Subject)
## 
##                                                    12s_v010_final.fasta 
##                                                                   16830 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                   16415 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                   16154 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                   16158 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                   16252 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                   16329 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                   16443 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                   16095 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                   16206 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                   16501 
##    12s_v010_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                   16287 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                   16606 
## 12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                   16516 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                   16410 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                   16576 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                   16315 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                   16537 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                   16225 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                   16388 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                   16345 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                   16522 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                   15956 
##  12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                   15695 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                   16021 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                   16039 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                   15821 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                   15944 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                   15793 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                   15829 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                   15823 
##   12s_v010_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                   15866 
##                                                    12S_v10_HmmCut.fasta 
##                                                                    8714 
##                                                     16S_v04_final.fasta 
##                                                                   17914 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                   16614 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                   16600 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                   16975 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                   16807 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                   17016 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                   16929 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                   16570 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                   17081 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                   16405 
##     16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                   16525 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                   17290 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                   17107 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                   17405 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                   17070 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                   17285 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                   17609 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                   17193 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                   17344 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                   17502 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                   17399 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                   16412 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                   16567 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                   16534 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                   16431 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                   16675 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                   16414 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                   16223 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                   16385 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                   16250 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                   16358 
##                                                    16S_v04_HmmCut.fasta 
##                                                                    9590 
##                                                     c01_v03_final.fasta 
##                                                                   12849 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                   12526 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                   13147 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                   12573 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                   13159 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                   12347 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                   12512 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                   12463 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                   12913 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                   12469 
##     c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                   13165 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                   12695 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                   12623 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                   12596 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                   12574 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                   12660 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                   12686 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                   12569 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                   12659 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                   12812 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                   12739 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                   13229 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                   13188 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                   13049 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                   12293 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                   12250 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                   12753 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                   13043 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                   12881 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                   12231 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                   12242 
##                                                    c01_v03_HmmCut.fasta 
##                                                                    6792
twelveS_data <- data |> filter(Subject == '12s_v010_final.fasta')
sixteenS_data <- data |> filter(Subject == '16S_v04_final.fasta')
co1_data <- data |> filter(Subject == 'c01_v03_final.fasta')
table(twelveS_data$Query)
## 
##                                                                                KWest_16S_PooledTrue.fa 
##                                                                                                   4263 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                   1321 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                   1321 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                   1095 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                   1095 
##        make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa 
##                                                                                                   1068 
##                               make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa 
##                                                                                                    990 
##                                   make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa 
##                                                                                                   1000 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa 
##                                                                                                    316 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa 
##                                                                                                    299 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa 
##                                                                                                    281 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa 
##                                                                                                   1343 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa 
##                                                                                                   1226 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa 
##                                                                                                   1212
table(sixteenS_data$Query)
## 
##                                                                                KWest_16S_PooledTrue.fa 
##                                                                                                   4928 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                   1230 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                   1194 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                   1306 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                   1306 
##        make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa 
##                                                                                                   1068 
##                               make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa 
##                                                                                                    990 
##                                   make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa 
##                                                                                                   1000 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa 
##                                                                                                    307 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa 
##                                                                                                    341 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa 
##                                                                                                    281 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa 
##                                                                                                   1274 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa 
##                                                                                                   1477 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa 
##                                                                                                   1212
table(sixteenS_data$Subject)
## 
## 16S_v04_final.fasta 
##               17914
table(co1_data$Query)
## 
##                                                                                KWest_16S_PooledTrue.fa 
##                                                                                                   2768 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                    795 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                    795 
##   make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa 
##                                                                                                    799 
## make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_noLulu_RESULTS_dada2_asv.fa 
##                                                                                                    799 
##        make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa 
##                                                                                                   1315 
##                               make_12s_16s_simulated_reads_6-fakeGenes_GreenGenes_RESULTS_dada2_asv.fa 
##                                                                                                    792 
##                                   make_12s_16s_simulated_reads_6-fakeGenes_Random_RESULTS_dada2_asv.fa 
##                                                                                                    800 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_12S_RESULTS_dada2_asv.fa 
##                                                                                                    192 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFlow_16S_RESULTS_dada2_asv.fa 
##                                                                                                    216 
##                     make_12s_16s_simulated_reads_7-Lutjanids_Mock_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa 
##                                                                                                    346 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_12S_RESULTS_dada2_asv.fa 
##                                                                                                    818 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_16S_RESULTS_dada2_asv.fa 
##                                                                                                    899 
##                           make_12s_16s_simulated_reads_8-Rottnest_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa 
##                                                                                                   1515
twelveS_data_vs_12S_100 <- twelveS_data |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa')
sixteenS_data_vs_16S_100 <- sixteenS_data |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa' )
co1_data_vs_co1_100 <- co1_data |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa')
twelveS_data_vs_12S_100 |> select(Type, species) |> filter(species != 'dropped' &
                                                             !is.na(species)) |>
  group_by(Type) |> count(species) |> summarise(n = n()) |>
  ggplot(aes(x = Type, y = n, fill = n)) + geom_col() + coord_flip() + 
  theme_minimal() + 
  ylab('Count') + 
  ggtitle('12S: Species-level hits per classifier')

twelveS_data_vs_12S_100 |> select(Type, genus) |> filter(genus != 'dropped' &
                                                             !is.na(genus)) |>
  group_by(Type) |> count(genus) |> summarise(n = n()) |>
  ggplot(aes(x = Type, y = n, fill = n)) + geom_col() + coord_flip() + 
  theme_minimal() + 
  ylab('Count') + 
  ggtitle('12S: Genus-level hits per classifier')

twelveS_truth <- truth |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa') |> select(OTU, family, species) |> rename(True_OTU = OTU, True_family = family, True_species = species)
head(twelveS_truth)
## # A tibble: 6 × 3
##   True_OTU True_family    True_species            
##   <chr>    <chr>          <chr>                   
## 1 ASV_1    Syngnathidae   Phyllopteryx taeniolatus
## 2 ASV_2    Carcharhinidae Glyphis garricki        
## 3 ASV_3    Mullidae       Parupeneus barberinus   
## 4 ASV_4    Holocentridae  Myripristis vittata     
## 5 ASV_5    Scincidae      Tropidophorus hainanus  
## 6 ASV_6    Anatidae       Aythya nyroca
twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  mutate(Correct = True_species == species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  group_by(Type) |> count(Correct) |> 
  ggplot(aes(x = fct_rev(fct_reorder2(Type, Correct, n)), fill = Correct, y = n))+ geom_col() +
  coord_flip() + theme_minimal() + xlab('Type') + 
  ggtitle('12S: Correct and incorrect species-level classifications (absolute)') +
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73",
          "#F0E442", "#0072B2", "#D55E00", "#CC79A7"))

cols <- c('Correct species' = "#009E73", 'Correct genus'="#56B4E9", 'Correct family' = "#0072B2", 'Incorrect family' = "#E69F00", 'Incorrect genus'="#F0E442", 'Incorrect species'="#D55E00", 'No hit'= "#CC79A7")
twelve_s_relative_plot <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type) |>
  count(CorrectSpecies) |> 
  mutate(total = sum(n, na.rm=TRUE)) |> 
  mutate(missing = 99 - total) |> 
  group_modify(~ add_row(.x)) |> 
  group_modify(~ {
    .x |> mutate(new_col= max(missing, na.rm=TRUE)) |> 
      mutate(n = case_when(is.na(CorrectSpecies) & is.na(missing) ~ new_col,
                                  TRUE ~ n)) |> 
      select(-new_col)
  } ) |> 
  mutate(total = 99) |> 
  mutate(perc = n / total * 100) |> 
  mutate(CorrectSpecies = replace_na(CorrectSpecies, 'No hit')) |> 
  mutate(CorrectSpecies = factor(CorrectSpecies, rev(c('Correct species', 'Correct genus', 'Correct family', 'Incorrect family', 'Incorrect genus', 'Incorrect species', 'No hit')))) |> 
  ggplot(aes(x = fct_rev(fct_reorder2(Type, CorrectSpecies, n)), fill = CorrectSpecies, y = perc))+ 
  geom_col() +
  coord_flip() + 
  theme_minimal() + 
  ylab('Percentage') + xlab('Type') +
  ggtitle('12S: Correct and incorrect species-level classifications (relative)') +
  scale_fill_manual(name = 'Outcome', values = cols, breaks=names(cols))
twelve_s_relative_plot

### Calculate Upset-based species sightings

type_list <- twelveS_data_vs_12S_100 |> select(Type, species) |> unique() |> filter(!is.na(species) & species != 'dropped') |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

a <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('12S: Shared species') +
  ylab('Species')
a

type_list <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  filter(species == True_species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  select(Type, species) |> unique() |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

b <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('12S: Shared correct species') +
  ylab('Species')
b

type_list <- twelveS_data_vs_12S_100 |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  filter(species != True_species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  select(Type, species) |> unique() |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

c <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('12S: Shared incorrect species') +
  ylab('Species')
c

a + b + c & ylim(c(0, 30)) & 
  theme(
  # Hide panel borders and remove grid lines
  panel.border = element_blank(),
  panel.grid.major = element_blank(),
  panel.background = element_blank(),
  panel.grid.minor = element_blank(),
  #panel.grid.major.y = element_line(),
  # Change axis line
  axis.line = element_line(colour = "black")
  )

Calculate FP/TP/TN/FN

add_scores <- function(twelveS_data_vs_12S_100_with_MaxTruth, twelveS_truth ) {
  twelveS_data_vs_12S_100_with_MaxTruth|> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 99 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums))
}
scores <- add_scores(twelveS_data_vs_12S_100, twelveS_truth)
scores <- scores |> rowwise() |> mutate(Recall = recall(TP, FN), Precision = precision(TP, FP),
                              f1 = f1(Precision, Recall), f0.5 = f0.5(Precision, Recall), accuracy = accuracy(TP, FP, FN, TN))
scores
## # A tibble: 14 × 10
## # Rowwise: 
##    Type           TP    FP    TN    FN Recall Precision    f1  f0.5 accuracy
##    <chr>       <int> <int> <int> <dbl>  <dbl>     <dbl> <dbl> <dbl>    <dbl>
##  1 BLAST100       59     6     0    34  0.634     0.908 0.747 0.836    0.596
##  2 BLAST97        48     8     0    43  0.527     0.857 0.653 0.762    0.485
##  3 CustomNBC      42    19     0    38  0.525     0.689 0.596 0.648    0.424
##  4 DADA2Spec      59     6     0    34  0.634     0.908 0.747 0.836    0.596
##  5 DADA2Tax       12    16     0    71  0.145     0.429 0.216 0.308    0.121
##  6 Kraken_0.0     54    14     0    31  0.635     0.794 0.706 0.756    0.545
##  7 Kraken_0.05    51     7     0    41  0.554     0.879 0.68  0.787    0.515
##  8 Kraken_0.1     47     5     0    47  0.5       0.904 0.644 0.778    0.475
##  9 MMSeqs2_100    59     6     0    34  0.634     0.908 0.747 0.836    0.596
## 10 MMSeqs2_97     59    13     0    27  0.686     0.819 0.747 0.789    0.596
## 11 Metabuli       56    13     0    30  0.651     0.812 0.723 0.773    0.566
## 12 Mothur         41    20     0    38  0.519     0.672 0.586 0.635    0.414
## 13 Qiime2         29    22     0    48  0.377     0.569 0.453 0.516    0.293
## 14 VSEARCH        40    15     0    44  0.476     0.727 0.576 0.658    0.404
twelveS_scoreS_plot <- scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |>  ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1.05))  + theme_minimal_hgrid()+ theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') + ggtitle('12S scores')
twelveS_scoreS_plot

Scores radar

scores |> select(-c(TP, FP, TN, FN)) |> 
  rename('group' = 'Type') |> 
  ggradar()

Scores heatmap

Let’s also make a heatmap from that

b <- scores$Type
m <- scores |> select(-Type) |> select(accuracy, Recall, Precision, f1, f0.5) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
         color = colorRampPalette(brewer.pal(n = 7, name =
  "RdYlGn"))(100))

b <- scores$Type
m <- scores |> select(-Type) |> select(TP, FP, FN) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
         color = colorRampPalette(brewer.pal(n = 7, name =
  "RdYlGn"))(100))

Metaclassifier

table(twelveS_data_vs_12S_100$Type)
## 
##    BLAST100     BLAST97   CustomNBC   DADA2Spec    DADA2Tax  Kraken_0.0 
##          86          95          99          99          99          99 
## Kraken_0.05  Kraken_0.1    Metabuli MMSeqs2_100  MMSeqs2_97      Mothur 
##          99          99          99          99          99          99 
##      Qiime2     VSEARCH 
##          51          99

First, we count the per-OTU species hits

twelveS_data_vs_12S_100_maxCount <- twelveS_data_vs_12S_100 |>  
  mutate(species = na_if(species, 'dropped')) |> 
  filter(!is.na(species)) |> 
  #filter(! Type %in% c('Mothur', 'VSEARCH', 'Kraken_0.2', 'Mothur', 'Metabuli', 'NBC', 'BLAST97', 'Kraken_0.0', 'Kraken_0.1')) |> 
  group_by(Query, Subject, OTU) |> 
  count(species) |> 
  # double check the truth
  #left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |> 
  #mutate(Truth = True_species == species) |> 
  # pull out the per-group highest n
  filter( n > 4) |> 
  slice_max(n, n=1, with_ties = FALSE) |> 
  mutate(Type = 'MaxCount', .before = 'Query') |> 
  select(-n)
twelveS_data_vs_12S_100_maxCount
## # A tibble: 70 × 5
## # Groups:   Query, Subject, OTU [70]
##    Type     Query                                          Subject OTU   species
##    <chr>    <chr>                                          <chr>   <chr> <chr>  
##  1 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_1 Phyllo…
##  2 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Cirrip…
##  3 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Sterco…
##  4 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Carcha…
##  5 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Hemigy…
##  6 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Ctenoc…
##  7 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Daptio…
##  8 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Engrau…
##  9 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Bathyr…
## 10 MaxCount make_12s_16s_simulated_reads_5-BetterDatabase… 12s_v0… ASV_… Tripho…
## # ℹ 60 more rows
twelveS_data_vs_12S_100_with_MaxTruth <- twelveS_data_vs_12S_100 |> 
  bind_rows(twelveS_data_vs_12S_100_maxCount) #|>  
  #filter(! Type %in% c('Mothur', 'VSEARCH', 'Kraken_0.2', 'Mothur', 'Metabuli', 'NBC', 'BLAST97', 'Kraken_0.0', 'Kraken_0.1')) 
maxTruth_scores <-  add_scores(twelveS_data_vs_12S_100_with_MaxTruth, twelveS_truth )
maxTruth_scores <- maxTruth_scores |> rowwise() |> mutate(Recall = recall(TP, FN), Precision = precision(TP, FP),
                              f1 = f1(Precision, Recall), f0.5 = f0.5(Precision, Recall), accuracy = accuracy(TP, FP, FN, TN))
maxTruth_scoreS_plot <- maxTruth_scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |>  ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1))  + theme_minimal_hgrid()+ theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') + geom_point() + ggtitle('12S scores')
maxTruth_scoreS_plot

Interestingly, just counting the labels is not good! It performs worse than BLAST.

16S

sixteenS_truth <- truth |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa') |> select(OTU, family, species) |> rename(True_OTU = OTU, True_family = family, True_species = species)
head(sixteenS_truth)
## # A tibble: 6 × 3
##   True_OTU True_family    True_species            
##   <chr>    <chr>          <chr>                   
## 1 ASV_1    Syngnathidae   Phyllopteryx taeniolatus
## 2 ASV_2    Carcharhinidae Glyphis garricki        
## 3 ASV_3    Merlucciidae   Merluccius productus    
## 4 ASV_4    Mullidae       Parupeneus barberinus   
## 5 ASV_5    Syngnathidae   Hippocampus algiricus   
## 6 ASV_6    Eleotridae     Bostrychus sinensis
sixteenS_relative_plot <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
 separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type) |> 
  count(CorrectSpecies) |> 
  mutate(total = sum(n)) |> 
  mutate(missing = 99 - total) |> 
  group_modify(~ add_row(.x)) |> 
  group_modify(~ {
    .x |> mutate(new_col= max(missing, na.rm=TRUE)) |> 
      mutate(n = case_when(is.na(CorrectSpecies) & is.na(missing) ~ new_col,
                                  TRUE ~ n)) |> 
      select(-new_col)
  } ) |> 
  mutate(total = 99) |> 
  mutate(perc = n / total * 100) |> 
  mutate(CorrectSpecies = replace_na(CorrectSpecies, 'No hit')) |> 
  mutate(CorrectSpecies = factor(CorrectSpecies, rev(c('Correct species', 'Correct genus', 'Correct family', 'Incorrect family', 'Incorrect genus', 'Incorrect species', 'No hit')))) |> 
  tidyr::complete(CorrectSpecies, fill = list(n=0, total = 99, missing = NA, perc = 0)) |>  
  ggplot(aes(x = fct_rev(fct_reorder2(Type, CorrectSpecies, n)), fill = CorrectSpecies, y = perc))+ 
  geom_col() +
  coord_flip() + 
  theme_minimal() + 
  ylab('Percentage') + xlab('Type') +
  ggtitle('16S: Correct and incorrect species-level classifications (relative)') +
  scale_fill_manual(name = 'Outcome', values = cols, breaks=names(cols))
sixteenS_relative_plot 

Calculate scores

scores <- add_scores(sixteenS_data_vs_16S_100, sixteenS_truth)
scores
## # A tibble: 14 × 5
##    Type           TP    FP    TN    FN
##    <chr>       <int> <int> <int> <dbl>
##  1 BLAST100       51     0     0    48
##  2 BLAST97        48     5     0    46
##  3 CustomNBC      52    10     0    37
##  4 DADA2Spec      50     1     0    48
##  5 DADA2Tax       21    30     0    48
##  6 Kraken_0.0     52    15     0    32
##  7 Kraken_0.05    48    13     0    38
##  8 Kraken_0.1     45    10     0    44
##  9 MMSeqs2_100    51     0     0    48
## 10 MMSeqs2_97     60     5     0    34
## 11 Metabuli       53    18     0    28
## 12 Mothur         50    14     0    35
## 13 Qiime2         42    15     0    42
## 14 VSEARCH        43    12     0    44
scores <- scores |> rowwise() |> mutate(Recall = recall(TP, FN), Precision = precision(TP, FP),
                              f1 = f1(Precision, Recall), f0.5 = f0.5(Precision, Recall), accuracy = accuracy(TP, FP, FN, TN))
scores
## # A tibble: 14 × 10
## # Rowwise: 
##    Type           TP    FP    TN    FN Recall Precision    f1  f0.5 accuracy
##    <chr>       <int> <int> <int> <dbl>  <dbl>     <dbl> <dbl> <dbl>    <dbl>
##  1 BLAST100       51     0     0    48  0.515     1     0.68  0.842    0.515
##  2 BLAST97        48     5     0    46  0.511     0.906 0.653 0.784    0.485
##  3 CustomNBC      52    10     0    37  0.584     0.839 0.689 0.772    0.525
##  4 DADA2Spec      50     1     0    48  0.510     0.980 0.671 0.828    0.505
##  5 DADA2Tax       21    30     0    48  0.304     0.412 0.35  0.385    0.212
##  6 Kraken_0.0     52    15     0    32  0.619     0.776 0.689 0.739    0.525
##  7 Kraken_0.05    48    13     0    38  0.558     0.787 0.653 0.727    0.485
##  8 Kraken_0.1     45    10     0    44  0.506     0.818 0.625 0.728    0.455
##  9 MMSeqs2_100    51     0     0    48  0.515     1     0.68  0.842    0.515
## 10 MMSeqs2_97     60     5     0    34  0.638     0.923 0.755 0.847    0.606
## 11 Metabuli       53    18     0    28  0.654     0.746 0.697 0.726    0.535
## 12 Mothur         50    14     0    35  0.588     0.781 0.671 0.733    0.505
## 13 Qiime2         42    15     0    42  0.5       0.737 0.596 0.673    0.424
## 14 VSEARCH        43    12     0    44  0.494     0.782 0.606 0.700    0.434
sixteenS_score_plot <- scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |>  ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1))  + theme_minimal_hgrid() + theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') +  ggtitle('16S scores')
sixteenS_score_plot 

b <- scores$Type
m <- scores |> select(-Type) |> select(accuracy, Recall, Precision, f1, f0.5) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
         color = colorRampPalette(brewer.pal(n = 7, name =
  "RdYlGn"))(100))

b <- scores$Type
m <- scores |> select(-Type) |> select(TP, FP, FN) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
         color = colorRampPalette(brewer.pal(n = 7, name =
  "RdYlGn"))(100))

### Calculate Upset-based species sightings

type_list <- sixteenS_data_vs_16S_100  |> select(Type, species) |> unique() |> filter(!is.na(species) & species != 'dropped') |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

a <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('16S: Shared species') +
  ylab('Species')
a

type_list <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth,  by = c('OTU' = 'True_OTU')) |>
  filter(species == True_species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  select(Type, species) |> unique() |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

b <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('16S: Shared correct species') +
  ylab('Species')
b

type_list <- sixteenS_data_vs_16S_100 |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
  filter(species != True_species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  select(Type, species) |> unique() |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

c <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('16S: Shared incorrect species') +
  ylab('Species')
c

a + b + c & ylim(c(0, 20)) & 
  theme(
  # Hide panel borders and remove grid lines
  panel.border = element_blank(),
  panel.grid.major = element_blank(),
  panel.background = element_blank(),
  panel.grid.minor = element_blank(),
  #panel.grid.major.y = element_line(),
  # Change axis line
  axis.line = element_line(colour = "black")
  )

CO1

CO1_truth <- truth |> filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa') |> select(OTU, family, species) |> rename(True_OTU = OTU, True_family = family, True_species = species)
head(CO1_truth)
## # A tibble: 6 × 3
##   True_OTU True_family    True_species            
##   <chr>    <chr>          <chr>                   
## 1 ASV_1    Syngnathidae   Phycodurus eques        
## 2 ASV_2    Syngnathidae   Phyllopteryx taeniolatus
## 3 ASV_3    Carcharhinidae Glyphis garricki        
## 4 ASV_4    Syngnathidae   dropped                 
## 5 ASV_5    Mullidae       Parupeneus heptacantha  
## 6 ASV_6    Otariidae      Zalophus wollebaeki
CO1_relative_plot <- co1_data_vs_co1_100 |> left_join(CO1_truth, by = c('OTU' = 'True_OTU')) |>
 separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type) |> 
  count(CorrectSpecies) |> 
  mutate(total = sum(n)) |> 
  mutate(missing = 100 - total) |> 
  group_modify(~ add_row(.x)) |> 
  group_modify(~ {
    .x |> mutate(new_col= max(missing, na.rm=TRUE)) |> 
      mutate(n = case_when(is.na(CorrectSpecies) & is.na(missing) ~ new_col,
                                  TRUE ~ n)) |> 
      select(-new_col)
  } ) |> 
  mutate(total = 99) |> 
  mutate(perc = n / total * 100) |> 
  mutate(CorrectSpecies = replace_na(CorrectSpecies, 'No hit')) |> 
  mutate(CorrectSpecies = factor(CorrectSpecies, rev(c('Correct species', 'Correct genus', 'Correct family', 'Incorrect family', 'Incorrect genus', 'Incorrect species', 'No hit')))) |> 
  tidyr::complete(CorrectSpecies, fill = list(n=0, total = 99, missing = NA, perc = 0)) |>  
  ggplot(aes(x = fct_rev(fct_reorder2(Type, CorrectSpecies, n)), fill = CorrectSpecies, y = perc))+ 
  geom_col() +
  coord_flip() + 
  theme_minimal() + 
  ylab('Percentage') + xlab('Type') +
  ggtitle('CO1: Correct and incorrect species-level classifications (relative)') +
  scale_fill_manual(name = 'Outcome', values = cols, breaks=names(cols))
CO1_relative_plot 

Calculate scores

scores <- add_scores(co1_data_vs_co1_100, CO1_truth)
scores
## # A tibble: 14 × 5
##    Type           TP    FP    TN    FN
##    <chr>       <int> <int> <int> <dbl>
##  1 BLAST100       58     4     0    37
##  2 BLAST97        50     3     0    46
##  3 CustomNBC      63    22     0    14
##  4 DADA2Spec      57     4     0    38
##  5 DADA2Tax       10    37     0    52
##  6 Kraken_0.0      0     0     0    99
##  7 Kraken_0.05     0     0     0    99
##  8 Kraken_0.1      0     0     0    99
##  9 MMSeqs2_100    57     4     0    38
## 10 MMSeqs2_97     61     6     0    32
## 11 Metabuli       24     3     0    72
## 12 Mothur         64    22     0    13
## 13 Qiime2         51    17     0    31
## 14 VSEARCH        60    20     0    19
scores <- scores |> rowwise() |> mutate(Recall = recall(TP, FN), Precision = precision(TP, FP),
                              f1 = f1(Precision, Recall), f0.5 = f0.5(Precision, Recall), accuracy = accuracy(TP, FP, FN, TN))
scores
## # A tibble: 14 × 10
## # Rowwise: 
##    Type           TP    FP    TN    FN Recall Precision    f1  f0.5 accuracy
##    <chr>       <int> <int> <int> <dbl>  <dbl>     <dbl> <dbl> <dbl>    <dbl>
##  1 BLAST100       58     4     0    37  0.611     0.935 0.739 0.845    0.586
##  2 BLAST97        50     3     0    46  0.521     0.943 0.671 0.812    0.505
##  3 CustomNBC      63    22     0    14  0.818     0.741 0.778 0.755    0.636
##  4 DADA2Spec      57     4     0    38  0.6       0.934 0.731 0.841    0.576
##  5 DADA2Tax       10    37     0    52  0.161     0.213 0.183 0.2      0.101
##  6 Kraken_0.0      0     0     0    99  0         0     0     0        0    
##  7 Kraken_0.05     0     0     0    99  0         0     0     0        0    
##  8 Kraken_0.1      0     0     0    99  0         0     0     0        0    
##  9 MMSeqs2_100    57     4     0    38  0.6       0.934 0.731 0.841    0.576
## 10 MMSeqs2_97     61     6     0    32  0.656     0.910 0.762 0.845    0.616
## 11 Metabuli       24     3     0    72  0.25      0.889 0.390 0.588    0.242
## 12 Mothur         64    22     0    13  0.831     0.744 0.785 0.760    0.646
## 13 Qiime2         51    17     0    31  0.622     0.75  0.68  0.720    0.515
## 14 VSEARCH        60    20     0    19  0.759     0.75  0.755 0.752    0.606
CO1_score_plot <- scores |> select(-c(TP, FP, FN, TN)) |> pivot_longer(-Type, names_to='Score') |>  ggplot(aes(x = fct_rev(fct_reorder(Type, value)), y = value, group=Score, color = Score, fill =Score)) + geom_line() + ylim(c(0, 1))  + theme_minimal_hgrid() + theme(axis.text.x = element_text( angle = 45, hjust = 1)) + ylab('Score') + xlab('Tool') +  ggtitle('CO1 scores')
CO1_score_plot 

b <- scores$Type
m <- scores |> select(-Type) |> select(accuracy, Recall, Precision, f1, f0.5) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
         color = colorRampPalette(brewer.pal(n = 7, name =
  "RdYlGn"))(100))

b <- scores$Type
m <- scores |> select(-Type) |> select(TP, FP, FN) |> as.matrix()
rownames(m) <- b
pheatmap(m, cluster_cols=FALSE, display_numbers = TRUE,
         color = colorRampPalette(brewer.pal(n = 7, name =
  "RdYlGn"))(100))

### Calculate Upset-based species sightings

type_list <- co1_data_vs_co1_100  |> select(Type, species) |> unique() |> filter(!is.na(species) & species != 'dropped') |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

a <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('CO1: Shared species') +
  ylab('Species')
a

type_list <- co1_data_vs_co1_100 |> left_join(CO1_truth,  by = c('OTU' = 'True_OTU')) |>
  filter(species == True_species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  select(Type, species) |> unique() |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

b <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('CO1: Shared correct species') +
  ylab('Species')
b

type_list <- co1_data_vs_co1_100 |> left_join(CO1_truth, by = c('OTU' = 'True_OTU')) |>
  filter(species != True_species) |> 
  filter(species != 'dropped' & !is.na(species)) |> 
  select(Type, species) |> unique() |> 
  group_by(species) |> 
  summarize('Type' = list(Type))

c <- type_list |>
  ggplot(aes(x = Type)) + 
  geom_bar() +
  scale_x_upset(n_intersections = 12) +
  ggtitle('CO1: Shared incorrect species') +
  ylab('Species')
c

a + b + c & ylim(c(0, 20)) & 
  theme(
  # Hide panel borders and remove grid lines
  panel.border = element_blank(),
  panel.grid.major = element_blank(),
  panel.background = element_blank(),
  panel.grid.minor = element_blank(),
  #panel.grid.major.y = element_line(),
  # Change axis line
  axis.line = element_line(colour = "black")
  )

16S/12S/CO1 relative plots

sixteenS_relative_plot / twelve_s_relative_plot / CO1_relative_plot

Let’s make without titles, but with a/b

(sixteenS_relative_plot + ggtitle('') + ylab(''))/ (twelve_s_relative_plot + ggtitle('')) / (CO1_relative_plot + ggtitle('')) + 
  plot_annotation(tag_levels = c('A','B')) +
  plot_layout(guides = 'collect')

(sixteenS_score_plot +geom_point() + theme(axis.title.x = element_blank()))/ (twelveS_scoreS_plot + geom_point()) / (CO1_score_plot + geom_point())

12S exclusion databases

twelve_exclusions <- data |> filter(str_starts(Subject, '12s_v010_final.fasta_Taxonomies.CountedFams.txt_')) |> 
  filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_12S_Lulu_RESULTS_dada2_asv.fa')
twelve_exclusions_split <- twelve_exclusions |> separate(Subject, into = c('before', 'hit'), sep='.txt_') |> 
  # get rid of leftover non-subsetted databases
  filter(!is.na(hit)) |> 
  separate(hit, into=c('Database', 'after'), sep='_subset')
twelve_exclusions_split_averaged <- twelve_exclusions_split |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type, Database, after) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 99 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums)) |> 
  group_by(Type, Database) |> 
  summarise(mean_TP = mean(TP),
            mean_FP = mean(FP),
            mean_TN = mean(TN),
            mean_FN = mean(FN)) |> 
  rowwise() |> 
  mutate(Recall = recall(mean_TP, mean_FN), 
         Precision = precision(mean_TP, mean_FP),
         f1 = f1(Precision, Recall),
         f0.5 = f0.5(Precision, Recall),
         accuracy = accuracy(mean_TP, mean_FP, mean_FN, mean_TN)) 
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
twelve_exclusions_split_averaged <- twelve_exclusions_split_averaged |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '30%', # we switch from seventy INCLUDED to 30 EXCLUDED
                               Database == 'thirty' ~ '70%',
                               TRUE ~ Database))
f1_pl <- twelve_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = f1, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
f0.5_pl <- twelve_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = f0.5, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
Precision_pl <- twelve_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = Precision, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
Recall_pl <- twelve_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = Recall, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
(f1_pl / f0.5_pl / Precision_pl / Recall_pl) +  plot_layout(guides = 'collect')

Lets zero in on the Precision and make boxplots with jitter dots

un_summarised_twelve <- twelve_exclusions_split |> left_join(twelveS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type, Database, after) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 99 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums)) |> 
  rowwise() |> 
  mutate(Recall = recall(TP, FN), 
         Precision = precision(TP, FP),
         f1 = f1(Precision, Recall),
         f0.5 = f0.5(Precision, Recall),
         accuracy = accuracy(TP, FP, FN, TN)) |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '30%',
                               Database == 'thirty' ~ '70%',
                               TRUE ~ Database))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
un_summarised_twelve |> group_by(Type, Database) |> mutate(best = max(mean(Precision))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database),  color=Database, y = Precision)) +
  geom_boxplot(outlier.shape = NA) +
  coord_flip() +
  theme_minimal() +
  xlab('Type') +
  ylab('Precision') +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_twelve  |> group_by(Type, Database) |> mutate(best = max(mean(f0.5))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = f0.5)) +
  geom_boxplot(outlier.shape = NA) + 
  coord_flip() + 
  theme_minimal() + 
  xlab('Type') +
  ylab('f0.5') +
  ylim(c(0, 1)) +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_twelve  |> group_by(Type, Database) |> mutate(best = max(mean(Recall))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = Recall)) +
  geom_boxplot(outlier.shape = NA) + 
  coord_flip() + 
  theme_minimal() + 
  xlab('Type') +
  ylab('f0.5') +
  ylim(c(0, 1)) +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_twelve  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli',  'Mothur')) |> 
  ggplot(aes(x=Database, y = Precision, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot() +
  labs(fill='Type') + 
  ylab('Precision') + 
  theme_minimal()

false_positives <- un_summarised_twelve  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0','Metabuli', 'Kraken_0.1', 'MMSeqs2')) |> 
  ggplot(aes(x=Database, y = FP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot(outlier.shape=NA) +
  labs(fill='Type') + 
  ylab('False positives (%)') + 
  geom_point(aes(color=Type), 
             position = position_jitterdodge(jitter.width = 0.2), 
             alpha=0.5,
             show.legend = FALSE)+
  theme_minimal()
false_positives

true_positives <- un_summarised_twelve  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli',  'Kraken_0.1', 'MMSeqs2')) |> 
  ggplot(aes(x=Database, y = TP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot(outlier.shape=NA) +
  labs(fill='Type') + 
  ylab('True positives (%)') + 
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE)+
  theme_minimal()
true_positives

false_positives/ true_positives + plot_layout(guides = 'collect') & coord_flip()

### Phylogenetic diversity

We can also easily calculate alpha diversity across these tools, as alpha diversity is the number of species. We treat classifiers/Types as sites.

spec_summarised <- twelve_exclusions_split |> 
  group_by(Type, Query, Database, after) |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '30%',
                               Database == 'thirty' ~ '70%',
                               TRUE ~ Database)) |> 
  filter(!is.na(species)) |> 
  summarise(`Alpha diversity index` = length(unique(species)))
## `summarise()` has grouped output by 'Type', 'Query', 'Database'. You can
## override using the `.groups` argument.
spec_summarised |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot() +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + coord_flip() + theme_minimal()

Let’s also do not all of the classifiers

spec_summarised |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli',  'Kraken_0.1','MMSeqs2', 'Mothur')) |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot() +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + coord_flip() + theme_minimal()

a <- spec_summarised |> 
  filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0','Metabuli',  'Kraken_0.1','MMSeqs2', 'Mothur')) |> 
  group_by(Database) |> 
  arrange(Database) |> 
  group_map(~aov(`Alpha diversity index` ~ Type, data=.))

names(a) <- spec_summarised |> arrange(Database) |> pull(Database) |> unique() # I don't like this :(
a
## $`30%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  6544.683  1186.300
## Deg. of Freedom        5        54
## 
## Residual standard error: 4.687059
## Estimated effects may be unbalanced
## 
## $`50%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  10960.15   1365.50
## Deg. of Freedom        5        54
## 
## Residual standard error: 5.028622
## Estimated effects may be unbalanced
## 
## $`70%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  16836.95    655.90
## Deg. of Freedom        5        54
## 
## Residual standard error: 3.485154
## Estimated effects may be unbalanced
groupslist <- list()

for(key in names(a)) {
  print(key)
  groupslist[[key]] <- HSD.test(a[[key]], 'Type')$groups|> 
    as_tibble(rownames = 'Type') |> 
    select(-`Alpha diversity index`)
}
## [1] "30%"
## [1] "50%"
## [1] "70%"
groups_df <- bind_rows(groupslist, .id='Database')
twelve_s_exclusions_fig <- spec_summarised |> 
  filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0','Metabuli',  'Kraken_0.1','MMSeqs2', 'Mothur')) |> 
  left_join(groups_df, by = c('Database', 'Type')) |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot(outlier.shape=NA) +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + 
  geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
            #col = 'black',
            size = 4) +
  #coord_flip() + 
  theme_minimal() +
  theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
  guides(fill="none")
twelve_s_exclusions_fig

twelve_spec_summarised <- spec_summarised
twelve_spec_summarised$gene <- '12S'

16S exclusion databases

sixteen_exclusions <- data |> filter(str_starts(Subject, '16S_v04_final.fasta_Taxonomies.')) |> 
  filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_16S_Lulu_RESULTS_dada2_asv.fa')
table(sixteen_exclusions$Subject)
## 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                   1216 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                   1153 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                   1178 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                   1222 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                   1226 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                   1197 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                   1179 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                   1181 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                   1163 
##    16S_v04_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                   1192 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                   1186 
## 16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                   1243 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                   1255 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                   1238 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                   1245 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                   1263 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                   1239 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                   1245 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                   1249 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                   1248 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                   1123 
##  16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                   1162 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                   1141 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                   1127 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                   1158 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                   1129 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                   1126 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                   1149 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                   1128 
##   16S_v04_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                   1144
sixteen_exclusions_split <- sixteen_exclusions |> separate(Subject, into = c('before', 'hit'), sep='.txt_') |> 
  # get rid of leftover non-subsetted databases
  filter(!is.na(hit)) |> 
  separate(hit, into=c('Database', 'after'), sep='_subset')
sixteen_exclusions_split_averaged <- sixteen_exclusions_split |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type, Database, after) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 99 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums)) |> 
  group_by(Type, Database) |> 
  summarise(mean_TP = mean(TP),
            mean_FP = mean(FP),
            mean_TN = mean(TN),
            mean_FN = mean(FN)) |> 
  rowwise() |> 
  mutate(Recall = recall(mean_TP, mean_FN), 
         Precision = precision(mean_TP, mean_FP),
         f1 = f1(Precision, Recall),
         f0.5 = f0.5(Precision, Recall),
         accuracy = accuracy(mean_TP, mean_FP, mean_FN, mean_TN)) 
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
sixteen_exclusions_split_averaged <- sixteen_exclusions_split_averaged |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '30%',
                               Database == 'thirty' ~ '70%',
                               TRUE ~ Database))
f1_pl <- sixteen_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = f1, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
f0.5_pl <- sixteen_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = f0.5, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
Precision_pl <- sixteen_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = Precision, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
Recall_pl <- sixteen_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = Recall, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
(f1_pl / f0.5_pl / Precision_pl / Recall_pl) +  plot_layout(guides = 'collect')

Lets zero in on the Precision and make boxplots with jitter dots

un_summarised_sixteen <- sixteen_exclusions_split |> left_join(sixteenS_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type, Database, after) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 99 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums)) |> 
  rowwise() |> 
  mutate(Recall = recall(TP, FN), 
         Precision = precision(TP, FP),
         f1 = f1(Precision, Recall),
         f0.5 = f0.5(Precision, Recall),
         accuracy = accuracy(TP, FP, FN, TN)) |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '30%',
                               Database == 'thirty' ~ '70%',
                               TRUE ~ Database))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
un_summarised_sixteen |> group_by(Type, Database) |> mutate(best = max(mean(Precision, na.rm=TRUE))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database),  color=Database, y = Precision)) +
  geom_boxplot(outlier.shape = NA) +
  coord_flip() +
  theme_minimal() +
  xlab('Type') +
  ylab('Precision') +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_sixteen  |> group_by(Type, Database) |> mutate(best = max(mean(f0.5, na.rm=TRUE))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = f0.5)) +
  geom_boxplot(outlier.shape = NA) + 
  coord_flip() + 
  theme_minimal() + 
  xlab('Type') +
  ylab('f0.5') +
  ylim(c(0, 1)) +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_sixteen  |> group_by(Type, Database) |> mutate(best = max(mean(Recall))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = Recall)) +
  geom_boxplot(outlier.shape = NA) + 
  coord_flip() + 
  theme_minimal() + 
  xlab('Type') +
  ylab('f0.5') +
  ylim(c(0, 1)) +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_sixteen  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0',  'Metabuli',  'Mothur')) |> 
  ggplot(aes(x=Database, y = Precision, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot() +
  labs(fill='Type') + 
  ylab('Precision') + 
  theme_minimal()

false_positives <- un_summarised_sixteen  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0',  'Metabuli',  'Kraken_0.1', 'MMSeqs2')) |> 
  ggplot(aes(x=Database, y = FP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot(outlier.shape=NA) +
  labs(fill='Type') + 
  ylab('False positives (%)') + 
  geom_point(aes(color=Type), 
             position = position_jitterdodge(jitter.width = 0.2), 
             alpha=0.5,
             show.legend = FALSE)+
  theme_minimal()
false_positives

true_positives <- un_summarised_sixteen  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli', 'Kraken_0.1', 'MMSeqs2')) |> 
  ggplot(aes(x=Database, y = TP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot(outlier.shape=NA) +
  labs(fill='Type') + 
  ylab('True positives (%)') + 
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE)+
  theme_minimal()
true_positives

false_positives/ true_positives + plot_layout(guides = 'collect') & coord_flip()

### Phylogenetic diversity

We can also easily calculate alpha diversity across these tools, as alpha diversity is the number of species. We treat classifiers/Types as sites.

spec_summarised <- sixteen_exclusions_split |> 
  group_by(Type, Query, Database, after) |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '30%',
                               Database == 'thirty' ~ '70%',
                               TRUE ~ Database)) |> 
  filter(!is.na(species)) |> 
  summarise(`Alpha diversity index` = length(unique(species)))
## `summarise()` has grouped output by 'Type', 'Query', 'Database'. You can
## override using the `.groups` argument.
spec_summarised |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot() +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + coord_flip() + theme_minimal()

Let’s also do not all of the classifiers

spec_summarised |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0', 'Metabuli',  'Kraken_0.1','MMSeqs2', 'Mothur')) |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot() +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + coord_flip() + theme_minimal()

a <- spec_summarised |> 
  filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0',  'Metabuli',  'Kraken_0.1','MMSeqs2', 'Mothur')) |> 
  group_by(Database) |> 
  arrange(Database) |> 
  group_map(~aov(`Alpha diversity index` ~ Type, data=.))

names(a) <- spec_summarised |> arrange(Database) |> pull(Database) |> unique() # I don't like this :(
a
## $`30%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  9595.627   371.322
## Deg. of Freedom        5        53
## 
## Residual standard error: 2.6469
## Estimated effects may be unbalanced
## 
## $`50%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  15002.08   1554.90
## Deg. of Freedom        5        54
## 
## Residual standard error: 5.366046
## Estimated effects may be unbalanced
## 
## $`70%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  18225.48    700.70
## Deg. of Freedom        5        54
## 
## Residual standard error: 3.602211
## Estimated effects may be unbalanced
library(agricolae)
groupslist <- list()

for(key in names(a)) {
  print(key)
  groupslist[[key]] <- HSD.test(a[[key]], 'Type')$groups|> 
    as_tibble(rownames = 'Type') |> 
    select(-`Alpha diversity index`)
}
## [1] "30%"
## [1] "50%"
## [1] "70%"
groups_df <- bind_rows(groupslist, .id='Database')
sixteen_s_exclusions_fig <- spec_summarised |> 
  filter(Type %in% c('BLAST100', 'BLAST97', 'Kraken_0.0', 'Metabuli',   'Kraken_0.1','MMSeqs2', 'Mothur')) |> 
  left_join(groups_df, by = c('Database', 'Type')) |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot(outlier.shape=NA) +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + 
  geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
            #col = 'black',
            size = 4) +
  #coord_flip() + 
  theme_minimal() +
  theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
  guides(fill="none")
sixteen_s_exclusions_fig

sixteen_spec_summarised <- spec_summarised
sixteen_spec_summarised$gene <- '16S'

CO1 exclusion databases

co1_exclusions <- data |> filter(str_starts(Subject, 'c01_v03_final.fasta_Taxonomies.')) |> 
  filter(Query == 'make_12s_16s_simulated_reads_5-BetterDatabaseARTSimulation_runEDNAFLOW_CO1_RESULTS_dada2_asv.fa')
table(co1_exclusions$Subject)
## 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_1.fasta 
##                                                                   1157 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_10.fasta 
##                                                                   1181 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_2.fasta 
##                                                                   1159 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_3.fasta 
##                                                                   1208 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_4.fasta 
##                                                                   1150 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_5.fasta 
##                                                                   1207 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_6.fasta 
##                                                                   1194 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_7.fasta 
##                                                                   1224 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_8.fasta 
##                                                                   1151 
##    c01_v03_final.fasta_Taxonomies.CountedFams.txt_fifty_subset_9.fasta 
##                                                                   1196 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_1.fasta 
##                                                                   1269 
## c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_10.fasta 
##                                                                   1241 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_2.fasta 
##                                                                   1237 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_3.fasta 
##                                                                   1212 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_4.fasta 
##                                                                   1253 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_5.fasta 
##                                                                   1245 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_6.fasta 
##                                                                   1208 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_7.fasta 
##                                                                   1254 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_8.fasta 
##                                                                   1266 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_seventy_subset_9.fasta 
##                                                                   1248 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_1.fasta 
##                                                                   1155 
##  c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_10.fasta 
##                                                                   1176 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_2.fasta 
##                                                                   1158 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_3.fasta 
##                                                                   1101 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_4.fasta 
##                                                                   1100 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_5.fasta 
##                                                                   1132 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_6.fasta 
##                                                                   1154 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_7.fasta 
##                                                                   1115 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_8.fasta 
##                                                                   1116 
##   c01_v03_final.fasta_Taxonomies.CountedFams.txt_thirty_subset_9.fasta 
##                                                                   1109
co1_exclusions_split <- co1_exclusions |> separate(Subject, into = c('before', 'hit'), sep='.txt_') |> 
  # get rid of leftover non-subsetted databases
  filter(!is.na(hit)) |> 
  separate(hit, into=c('Database', 'after'), sep='_subset')
co1_exclusions_split_averaged <- co1_exclusions_split |> left_join(CO1_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type, Database, after) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 99 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums)) |> 
  group_by(Type, Database) |> 
  summarise(mean_TP = mean(TP),
            mean_FP = mean(FP),
            mean_TN = mean(TN),
            mean_FN = mean(FN)) |> 
  rowwise() |> 
  mutate(Recall = recall(mean_TP, mean_FN), 
         Precision = precision(mean_TP, mean_FP),
         f1 = f1(Precision, Recall),
         f0.5 = f0.5(Precision, Recall),
         accuracy = accuracy(mean_TP, mean_FP, mean_FN, mean_TN)) 
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'Type'. You can override using the
## `.groups` argument.
co1_exclusions_split_averaged <- co1_exclusions_split_averaged |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '30%',
                               Database == 'thirty' ~ '70%',
                               TRUE ~ Database))
f1_pl <- co1_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = f1, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
f0.5_pl <- co1_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = f0.5, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
Precision_pl <- co1_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = Precision, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
Recall_pl <- co1_exclusions_split_averaged |> 
  ggplot(aes(x = Type, y = Recall, group = Database, color = Database, fill = Database)) + 
  geom_point() +
  geom_line() +
  theme_minimal()
(f1_pl / f0.5_pl / Precision_pl / Recall_pl) +  plot_layout(guides = 'collect')

Lets zero in on the Precision and make boxplots with jitter dots

un_summarised_co1 <- co1_exclusions_split |> left_join(CO1_truth, by = c('OTU' = 'True_OTU')) |>
  separate(True_species, into = c('True_Genus', 'True_Epiteth'), remove = FALSE)|> 
  mutate(species = na_if(species, 'dropped')) |> 
  mutate(genus = na_if(genus, 'dropped')) |> 

  mutate(CorrectSpecies = case_when(!is.na(species) & True_species == species ~ 'Correct species',
                              !is.na(species) & True_species != species ~ 'Incorrect species',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus == genus ~ 'Correct genus',
                              !is.na(genus) & !is.na(True_Genus) & True_Genus != genus ~ 'Incorrect genus',
                              !is.na(family) & True_family == family ~ 'Correct family',
                              !is.na(family) & True_family != family ~ 'Incorrect family',
                              TRUE ~ NA)) |> 
  group_by(Type, Database, after) |>
  summarise(TP = sum(str_detect(CorrectSpecies, pattern='Correct species'), na.rm=TRUE),
            FP = sum(str_detect(CorrectSpecies, pattern = 'Incorrect species'), na.rm=TRUE),
            TN = sum(str_detect(replace_na(CorrectSpecies,'NA'), pattern='NA') & is.na(True_species), na.rm=TRUE),
            FN = sum(is.na(species) & !is.na(True_species))) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  mutate(missing = 99 - sums) |> 
  mutate(FN = FN + missing) |> 
  mutate(sums = TP + FP + TN + FN) |> 
  select(-c(missing, sums)) |> 
  rowwise() |> 
  mutate(Recall = recall(TP, FN), 
         Precision = precision(TP, FP),
         f1 = f1(Precision, Recall),
         f0.5 = f0.5(Precision, Recall),
         accuracy = accuracy(TP, FP, FN, TN)) |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '30%',
                               Database == 'thirty' ~ '70%',
                               TRUE ~ Database))
## `summarise()` has grouped output by 'Type', 'Database'. You can override using
## the `.groups` argument.
un_summarised_co1 |> group_by(Type, Database) |> mutate(best = max(mean(Precision, na.rm=TRUE))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database),  color=Database, y = Precision)) +
  geom_boxplot(outlier.shape = NA) +
  coord_flip() +
  theme_minimal() +
  xlab('Type') +
  ylab('Precision') +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_co1  |> group_by(Type, Database) |> mutate(best = max(mean(f0.5, na.rm=TRUE))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = f0.5)) +
  geom_boxplot(outlier.shape = NA) + 
  coord_flip() + 
  theme_minimal() + 
  xlab('Type') +
  ylab('f0.5') +
  ylim(c(0, 1)) +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_co1  |> group_by(Type, Database) |> mutate(best = max(mean(Recall))) |>
  ggplot(aes(x = fct_reorder(Type, best), group = interaction(Type, Database), color=Database, y = Recall)) +
  geom_boxplot(outlier.shape = NA) + 
  coord_flip() + 
  theme_minimal() + 
  xlab('Type') +
  ylab('f0.5') +
  ylim(c(0, 1)) +
  geom_point(position = position_jitterdodge(), alpha=0.5)

un_summarised_co1  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0',  'Metabuli',  'Mothur')) |> 
  ggplot(aes(x=Database, y = Precision, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot() +
  labs(fill='Type') + 
  ylab('Precision') + 
  theme_minimal()

false_positives <- un_summarised_co1  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0',  'Metabuli', 'Kraken_0.1', 'MMSeqs2')) |> 
  ggplot(aes(x=Database, y = FP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot(outlier.shape=NA) +
  labs(fill='Type') + 
  ylab('False positives (%)') + 
  geom_point(aes(color=Type), 
             position = position_jitterdodge(jitter.width = 0.2), 
             alpha=0.5,
             show.legend = FALSE)+
  theme_minimal()
false_positives

true_positives <- un_summarised_co1  |> 
  filter(Type %in% c('BLAST100', 'Kraken_0.0',  'Metabuli', 'Kraken_0.1', 'MMSeqs2')) |> 
  ggplot(aes(x=Database, y = TP/99*100, fill=Type)) + #fill=factor(Database, levels=c('30%','50%','70%')))) + 
  geom_boxplot(outlier.shape=NA) +
  labs(fill='Type') + 
  ylab('True positives (%)') + 
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE)+
  theme_minimal()
true_positives

false_positives/ true_positives + plot_layout(guides = 'collect') & coord_flip()

### Phylogenetic diversity

We can also easily calculate alpha diversity across these tools, as alpha diversity is the number of species. We treat classifiers/Types as sites.

spec_summarised <- co1_exclusions_split |> 
  group_by(Type, Query, Database, after) |> 
  mutate(Database = case_when ( Database == 'fifty' ~ '50%',
                               Database == 'seventy' ~ '30%',
                               Database == 'thirty' ~ '70%',
                               TRUE ~ Database)) |> 
  filter(!is.na(species)) |> 
  summarise(`Alpha diversity index` = length(unique(species)))
## `summarise()` has grouped output by 'Type', 'Query', 'Database'. You can
## override using the `.groups` argument.
spec_summarised |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot() +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + coord_flip() + theme_minimal()

Let’s also do not all of the classifiers

spec_summarised |> 
  filter(Type %in% c('BLAST97', 'Kraken_0.0', 'Metabuli', 'MMSeqs2_97', 'Mothur')) |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot() +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + coord_flip() + theme_minimal()

a <- spec_summarised |> 
  filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |> 
  group_by(Database) |> 
  arrange(Database) |> 
  group_map(~aov(`Alpha diversity index` ~ Type, data=.))

names(a) <- spec_summarised |> arrange(Database) |> pull(Database) |> unique() # I don't like this :(
a
## $`30%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  5882.075  1088.900
## Deg. of Freedom        3        36
## 
## Residual standard error: 5.499747
## Estimated effects may be unbalanced
## 
## $`50%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  1677.875   503.900
## Deg. of Freedom        3        36
## 
## Residual standard error: 3.741286
## Estimated effects may be unbalanced
## 
## $`70%`
## Call:
##    aov(formula = `Alpha diversity index` ~ Type, data = .)
## 
## Terms:
##                     Type Residuals
## Sum of Squares  14061.93    825.00
## Deg. of Freedom        5        54
## 
## Residual standard error: 3.90868
## Estimated effects may be unbalanced
library(agricolae)
groupslist <- list()

for(key in names(a)) {
  print(key)
  groupslist[[key]] <- HSD.test(a[[key]], 'Type')$groups|> 
    as_tibble(rownames = 'Type') |> 
    select(-`Alpha diversity index`)
}
## [1] "30%"
## [1] "50%"
## [1] "70%"
groups_df <- bind_rows(groupslist, .id='Database')
co1_s_exclusions_fig <- spec_summarised |> 
  filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |> 
  left_join(groups_df, by = c('Database', 'Type')) |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot(outlier.shape=NA) +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + 
  geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
            #col = 'black',
            size = 4) +
  #coord_flip() + 
  theme_minimal() +
  theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
  guides(fill="none")
co1_s_exclusions_fig

co1_spec_summarised <- spec_summarised
co1_spec_summarised$gene <- 'CO1'

12S and 16S and CO1 exclusions

both <- rbind(twelve_spec_summarised, sixteen_spec_summarised, co1_spec_summarised)
both_fig <- both |> 
  filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |> 
  left_join(groups_df, by = c('Database', 'Type')) |>
  mutate(Type = gsub('Kraken', 'Kraken2', Type)) |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot(outlier.shape=NA) +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + 
  geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
            #col = 'black',
            size = 3) +
  #coord_flip() + 
  theme_minimal() +
  theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
  guides(fill="none") +
  facet_wrap(~gene+Database, nrow = 3)
both_fig

my_save_plot(both_fig, 'exclusions_fig')

I want to remove 50%, there’s too much going on. I also want to use patchwork

top_12 <-  both |> 
  filter(gene == '12S') |> 
  filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |> 
  left_join(groups_df, by = c('Database', 'Type')) |> 
  filter(Database != '50%') |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot(outlier.shape=NA) +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + 
  geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
            #col = 'black',
            size = 3) +
  #coord_flip() + 
  theme_minimal() +
  theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
  guides(fill="none") +
  ylab('Number of identified species')+ 
  facet_wrap(~Database, nrow = 1) +
  theme(panel.spacing = unit(2, "lines")) + 
  ylim(c(10,90))

middle_16 <- both |> 
  filter(gene == '16S') |> 
  filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |> 
  left_join(groups_df, by = c('Database', 'Type')) |> 
  filter(Database != '50%') |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot(outlier.shape=NA) +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + 
  geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
            #col = 'black',
            size = 3) +
  #coord_flip() + 
  theme_minimal() +
  theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
  guides(fill="none") +
  ylab('Number of identified species')+ 
  facet_wrap(~Database, nrow = 1) +
  theme(panel.spacing = unit(2, "lines"))+ 
  ylim(c(10,90))

  
bottom_coi <- both |> 
  filter(gene == 'CO1') |> 
  filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |> 
  left_join(groups_df, by = c('Database', 'Type')) |> 
  filter(Database != '50%') |> 
  ggplot(aes(x = Type, y = `Alpha diversity index`, fill=Type, group = Type )) + 
  geom_boxplot(outlier.shape=NA) +
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) + 
  facet_wrap(~Database) + 
  geom_text(aes(x = Type, y = max(`Alpha diversity index`) + 2, label = groups),
            #col = 'black',
            size = 3) +
  #coord_flip() + 
  theme_minimal() +
  theme(axis.text.x = element_text( angle = 90, hjust = 1)) +
  guides(fill="none") +
  ylab('Number of identified species')+
  xlab('Classifier')+ 
  facet_wrap(~Database, nrow = 1) +
  theme(panel.spacing = unit(2, "lines"))+ 
  ylim(c(10,90))

  
without50_fig <- (top_12 + theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) + ylab(NULL)) / (middle_16 +  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  ) ) / (bottom_coi + ylab(NULL)) + plot_annotation(tag_levels = 'A') +
  plot_layout(guides = 'collect')


without50_fig

my_save_plot(without50_fig, 'exclusions_without50_fig')

Hmmm I don’t like this.

Let’s do it different:

without_50_redesigned_top <- both |> 
  filter(gene == '12S') |> 
  filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |> 
  left_join(groups_df, by = c('Database', 'Type')) |> 
   mutate(Type = gsub('Kraken', 'Kraken2', Type)) |> 
  filter(Database != '50%') |> 
  ggplot(aes(x = Database, y = `Alpha diversity index`, fill=Type, group = Database )) + 
  geom_boxplot(outlier.shape=NA, show.legend = FALSE) + 
  facet_wrap(~Type, nrow=1) + 
  ylab('Number of identified species') +
  xlab('Exclusion percentage') +  
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE)

without_50_redesigned_bottom <- both |> 
  filter(gene == 'CO1') |> 
  filter(Type %in% c('BLAST97', 'Metabuli', 'Kraken_0.05', 'Kraken_0.0', 'MMSeqs2_97', 'Mothur')) |> 
  left_join(groups_df, by = c('Database', 'Type')) |>
     mutate(Type = gsub('Kraken', 'Kraken2', Type)) |> 
  filter(Database != '50%') |> 
  ggplot(aes(x = Database, y = `Alpha diversity index`, fill=Type, group = Database )) + 
  geom_boxplot(outlier.shape=NA, show.legend = FALSE) + 
  facet_wrap(~Type, nrow=1) + 
  ylab('Number of identified species') +
  xlab('Exclusion percentage') +  
  geom_point(aes(color=Type), 
           position = position_jitterdodge(jitter.width = 0.2), 
           alpha=0.5,
           show.legend = FALSE) 

both_figs <- (without_50_redesigned_top + theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank())) / without_50_redesigned_bottom  + plot_annotation(tag_levels = 'A') 
my_save_plot(both_figs, 'exclusions_without50_fig_redesigned')

Let’s make the tables too:

tw <- twelve_exclusions_split_averaged
tw$Gene <- '12S'
six <- sixteen_exclusions_split_averaged
six$Gene <- '16S'
co <- co1_exclusions_split_averaged
co$Gene <- 'CO1'

both <- rbind(tw, six, co) |> relocate(Gene)
both |> my_save_table('all_exclusion_databases_scores') 
both |> group_by(Gene, Database) |> arrange(desc(f1)) |> slice(1) |> ungroup() |> 
  select(Gene, Type, Database, accuracy, Precision, Recall, f1, f0.5, accuracy) |> 
  my_save_table('averaged_top1_exclusion_database_scores')